# MVA Midterm Exam
# Name: Sarjak Maniar
# Email: sm2732@scarletmail.rutgers.edu

library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(corrplot)
## corrplot 0.92 loaded
library(cluster)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(devtools)
## Loading required package: usethis
library(readr)
library(psych)


# The accompanied dataset gives estimates of the average protein consumption in grams per 
# person per day) from different food sources for the inhabitants of 25 European countries.

## Importing Data and initial analysis
## Importing csv file

Protein_Consumption <- read.csv("/Users/sarju/Desktop/MITA Sem 2/MVA/MVA_Midterm/Protein_Consumption.csv")
Protein_Consumption
##           Country Red.Meat White.Meat Egg Milk Fish Cereals Starchy.Foods
## 1         Albania       10          1   1    9    0      42             1
## 2         Austria        9         14   4   20    2      28             4
## 3         Belgium       14          9   4   18    5      27             6
## 4        Bulgaria        8          6   2    8    1      57             1
## 5  Czechoslovakia       10         11   3   13    2      34             5
## 6         Denmark       11         11   4   25   10      22             5
## 7    East Germany        8         12   4   11    5      25             7
## 8         Finland       10          5   3   34    6      26             5
## 9          France       18         10   3   20    6      28             5
## 10         Greece       10          3   3   18    6      42             2
## 11        Hungary        5         12   3   10    0      40             4
## 12        Ireland       14         10   5   26    2      24             6
## 13          Italy        9          5   3   14    3      37             2
## 14    Netherlands       10         14   4   23    3      22             4
## 15         Norway        9          5   3   23   10      23             5
## 16         Poland        7         10   3   19    3      36             6
## 17       Portugal        6          4   1    5   14      27             6
## 18        Romania        6          6   2   11    1      50             3
## 19          Spain        7          3   3    9    7      29             6
## 20         Sweden       10          8   4   25    8      20             4
## 21    Switzerland       13         10   3   24    2      26             3
## 22 United Kingdom       17          6   5   21    4      24             5
## 23           USSR        9          5   2   17    3      44             6
## 24   West Germany       11         13   4   19    3      19             5
## 25     Yugoslavia        4          5   1   10    1      56             3
##    Pulses.Nuts.and.Oilseeds Fruits.and.Vegetables Total
## 1                         6                     2    72
## 2                         1                     4    86
## 3                         2                     4    89
## 4                         4                     4    91
## 5                         1                     4    83
## 6                         1                     2    91
## 7                         1                     4    77
## 8                         1                     1    91
## 9                         2                     7    99
## 10                        8                     7    99
## 11                        5                     4    83
## 12                        2                     3    92
## 13                        4                     7    84
## 14                        2                     4    86
## 15                        2                     3    83
## 16                        2                     7    93
## 17                        5                     8    76
## 18                        5                     3    87
## 19                        6                     7    77
## 20                        1                     2    82
## 21                        2                     5    88
## 22                        3                     3    88
## 23                        3                     3    92
## 24                        2                     4    80
## 25                        6                     3    89
Protein_Consumption <- as.data.frame(Protein_Consumption)
Protein_Consumption
##           Country Red.Meat White.Meat Egg Milk Fish Cereals Starchy.Foods
## 1         Albania       10          1   1    9    0      42             1
## 2         Austria        9         14   4   20    2      28             4
## 3         Belgium       14          9   4   18    5      27             6
## 4        Bulgaria        8          6   2    8    1      57             1
## 5  Czechoslovakia       10         11   3   13    2      34             5
## 6         Denmark       11         11   4   25   10      22             5
## 7    East Germany        8         12   4   11    5      25             7
## 8         Finland       10          5   3   34    6      26             5
## 9          France       18         10   3   20    6      28             5
## 10         Greece       10          3   3   18    6      42             2
## 11        Hungary        5         12   3   10    0      40             4
## 12        Ireland       14         10   5   26    2      24             6
## 13          Italy        9          5   3   14    3      37             2
## 14    Netherlands       10         14   4   23    3      22             4
## 15         Norway        9          5   3   23   10      23             5
## 16         Poland        7         10   3   19    3      36             6
## 17       Portugal        6          4   1    5   14      27             6
## 18        Romania        6          6   2   11    1      50             3
## 19          Spain        7          3   3    9    7      29             6
## 20         Sweden       10          8   4   25    8      20             4
## 21    Switzerland       13         10   3   24    2      26             3
## 22 United Kingdom       17          6   5   21    4      24             5
## 23           USSR        9          5   2   17    3      44             6
## 24   West Germany       11         13   4   19    3      19             5
## 25     Yugoslavia        4          5   1   10    1      56             3
##    Pulses.Nuts.and.Oilseeds Fruits.and.Vegetables Total
## 1                         6                     2    72
## 2                         1                     4    86
## 3                         2                     4    89
## 4                         4                     4    91
## 5                         1                     4    83
## 6                         1                     2    91
## 7                         1                     4    77
## 8                         1                     1    91
## 9                         2                     7    99
## 10                        8                     7    99
## 11                        5                     4    83
## 12                        2                     3    92
## 13                        4                     7    84
## 14                        2                     4    86
## 15                        2                     3    83
## 16                        2                     7    93
## 17                        5                     8    76
## 18                        5                     3    87
## 19                        6                     7    77
## 20                        1                     2    82
## 21                        2                     5    88
## 22                        3                     3    88
## 23                        3                     3    92
## 24                        2                     4    80
## 25                        6                     3    89
#Dimension of the dataset
dim(Protein_Consumption) # 25 11 [There are 25 rows and 11 columns]
## [1] 25 11
colnames(Protein_Consumption) # printing all the column names
##  [1] "Country"                  "Red.Meat"                
##  [3] "White.Meat"               "Egg"                     
##  [5] "Milk"                     "Fish"                    
##  [7] "Cereals"                  "Starchy.Foods"           
##  [9] "Pulses.Nuts.and.Oilseeds" "Fruits.and.Vegetables"   
## [11] "Total"
attach(Protein_Consumption)

# Printing first 5 rows of the dataset
head(Protein_Consumption)
##          Country Red.Meat White.Meat Egg Milk Fish Cereals Starchy.Foods
## 1        Albania       10          1   1    9    0      42             1
## 2        Austria        9         14   4   20    2      28             4
## 3        Belgium       14          9   4   18    5      27             6
## 4       Bulgaria        8          6   2    8    1      57             1
## 5 Czechoslovakia       10         11   3   13    2      34             5
## 6        Denmark       11         11   4   25   10      22             5
##   Pulses.Nuts.and.Oilseeds Fruits.and.Vegetables Total
## 1                        6                     2    72
## 2                        1                     4    86
## 3                        2                     4    89
## 4                        4                     4    91
## 5                        1                     4    83
## 6                        1                     2    91
# Printing the summary of the dataset
summary(Protein_Consumption)
##    Country             Red.Meat      White.Meat         Egg      
##  Length:25          Min.   : 4.0   Min.   : 1.00   Min.   :1.00  
##  Class :character   1st Qu.: 8.0   1st Qu.: 5.00   1st Qu.:3.00  
##  Mode  :character   Median :10.0   Median : 8.00   Median :3.00  
##                     Mean   : 9.8   Mean   : 7.92   Mean   :3.08  
##                     3rd Qu.:11.0   3rd Qu.:11.00   3rd Qu.:4.00  
##                     Max.   :18.0   Max.   :14.00   Max.   :5.00  
##       Milk            Fish          Cereals      Starchy.Foods 
##  Min.   : 5.00   Min.   : 0.00   Min.   :19.00   Min.   :1.00  
##  1st Qu.:11.00   1st Qu.: 2.00   1st Qu.:24.00   1st Qu.:3.00  
##  Median :18.00   Median : 3.00   Median :28.00   Median :5.00  
##  Mean   :17.28   Mean   : 4.28   Mean   :32.32   Mean   :4.36  
##  3rd Qu.:23.00   3rd Qu.: 6.00   3rd Qu.:40.00   3rd Qu.:6.00  
##  Max.   :34.00   Max.   :14.00   Max.   :57.00   Max.   :7.00  
##  Pulses.Nuts.and.Oilseeds Fruits.and.Vegetables     Total      
##  Min.   :1.00             Min.   :1.0           Min.   :72.00  
##  1st Qu.:2.00             1st Qu.:3.0           1st Qu.:83.00  
##  Median :2.00             Median :4.0           Median :87.00  
##  Mean   :3.08             Mean   :4.2           Mean   :86.32  
##  3rd Qu.:5.00             3rd Qu.:5.0           3rd Qu.:91.00  
##  Max.   :8.00             Max.   :8.0           Max.   :99.00
# Printing the structure of the dataset
str(Protein_Consumption)
## 'data.frame':    25 obs. of  11 variables:
##  $ Country                 : chr  "Albania" "Austria" "Belgium" "Bulgaria" ...
##  $ Red.Meat                : int  10 9 14 8 10 11 8 10 18 10 ...
##  $ White.Meat              : int  1 14 9 6 11 11 12 5 10 3 ...
##  $ Egg                     : int  1 4 4 2 3 4 4 3 3 3 ...
##  $ Milk                    : int  9 20 18 8 13 25 11 34 20 18 ...
##  $ Fish                    : int  0 2 5 1 2 10 5 6 6 6 ...
##  $ Cereals                 : int  42 28 27 57 34 22 25 26 28 42 ...
##  $ Starchy.Foods           : int  1 4 6 1 5 5 7 5 5 2 ...
##  $ Pulses.Nuts.and.Oilseeds: int  6 1 2 4 1 1 1 1 2 8 ...
##  $ Fruits.and.Vegetables   : int  2 4 4 4 4 2 4 1 7 7 ...
##  $ Total                   : int  72 86 89 91 83 91 77 91 99 99 ...
# As we can see that all the columns are of type int except the column Country, which is of type chr

# =============================================================================================

# Question 1 - Use principal components analysis to investigate the relationships between the 
# countries on the basis of these variables

# Getting the Correlations 
cor(Protein_Consumption[-1])
##                             Red.Meat  White.Meat         Egg       Milk
## Red.Meat                  1.00000000  0.18850977  0.57532001  0.5440251
## White.Meat                0.18850977  1.00000000  0.60095535  0.2974816
## Egg                       0.57532001  0.60095535  1.00000000  0.6130310
## Milk                      0.54402512  0.29748163  0.61303102  1.0000000
## Fish                      0.06491072 -0.19719960  0.04780844  0.1624624
## Cereals                  -0.50970337 -0.43941908 -0.70131040 -0.5924925
## Starchy.Foods             0.15383673  0.33456770  0.41266333  0.2144917
## Pulses.Nuts.and.Oilseeds -0.40988882 -0.67214885 -0.59519381 -0.6238357
## Fruits.and.Vegetables    -0.06393465 -0.07329308 -0.16392249 -0.3997753
## Total                     0.37369919  0.10308602  0.18970028  0.4603542
##                                 Fish     Cereals Starchy.Foods
## Red.Meat                  0.06491072 -0.50970337    0.15383673
## White.Meat               -0.19719960 -0.43941908    0.33456770
## Egg                       0.04780844 -0.70131040    0.41266333
## Milk                      0.16246239 -0.59249246    0.21449173
## Fish                      1.00000000 -0.51714759    0.43868411
## Cereals                  -0.51714759  1.00000000   -0.57813449
## Starchy.Foods             0.43868411 -0.57813449    1.00000000
## Pulses.Nuts.and.Oilseeds -0.12226043  0.63605948   -0.49518800
## Fruits.and.Vegetables     0.22948842  0.04229293    0.06835670
## Total                    -0.09089592  0.18587578   -0.04418245
##                          Pulses.Nuts.and.Oilseeds Fruits.and.Vegetables
## Red.Meat                               -0.4098888           -0.06393465
## White.Meat                             -0.6721488           -0.07329308
## Egg                                    -0.5951938           -0.16392249
## Milk                                   -0.6238357           -0.39977527
## Fish                                   -0.1222604            0.22948842
## Cereals                                 0.6360595            0.04229293
## Starchy.Foods                          -0.4951880            0.06835670
## Pulses.Nuts.and.Oilseeds                1.0000000            0.35133227
## Fruits.and.Vegetables                   0.3513323            1.00000000
## Total                                  -0.0812251            0.07201466
##                                Total
## Red.Meat                  0.37369919
## White.Meat                0.10308602
## Egg                       0.18970028
## Milk                      0.46035417
## Fish                     -0.09089592
## Cereals                   0.18587578
## Starchy.Foods            -0.04418245
## Pulses.Nuts.and.Oilseeds -0.08122510
## Fruits.and.Vegetables     0.07201466
## Total                     1.00000000
# Plotting correlation plot to understand the most correlated variables
Protein_Consumption_plot <- cor(Protein_Consumption[-1])
corrplot(Protein_Consumption_plot, method="circle")

# Inferences: 
# We can see from the correlation plot, that Cereal and Eggs are negatively correlated. 
# Also, White Meat and Eggs are positively correlated.

# Finding the principal components of data

# Using prcomp to compute the principal components (eigenvalues and eigenvectors). 
# With scale=TRUE, variable means are set to zero, and variances set to one
Protein_Consumption_pca <- prcomp(Protein_Consumption[,-1],scale=TRUE) 
#Scaling to Standardize the data values
Protein_Consumption_pca
## Standard deviations (1, .., p=10):
##  [1] 2.032257e+00 1.319067e+00 1.144237e+00 1.021544e+00 8.360847e-01
##  [6] 6.531975e-01 5.841454e-01 4.366348e-01 3.458098e-01 6.618503e-16
## 
## Rotation (n x k) = (10 x 10):
##                                 PC1         PC2         PC3          PC4
## Red.Meat                 -0.3180769 -0.17809245 -0.38142753 -0.039766137
## White.Meat               -0.3140588 -0.11783853  0.36420271  0.538507972
## Egg                      -0.4202281 -0.08236350  0.02047575  0.155623651
## Milk                     -0.3870300 -0.23356182 -0.19997405 -0.320360929
## Fish                     -0.1271598  0.57388821 -0.33003267 -0.304161366
## Cereals                   0.4177240 -0.31321549 -0.02354236  0.104798477
## Starchy.Foods            -0.2880798  0.41038324  0.05768490  0.150709175
## Pulses.Nuts.and.Oilseeds  0.4177658  0.04145202 -0.24796403  0.008042093
## Fruits.and.Vegetables     0.1197680  0.34858202 -0.41210384  0.643455476
## Total                    -0.1062294 -0.41709540 -0.58081103  0.203145847
##                                  PC5          PC6         PC7        PC8
## Red.Meat                  0.53138781 -0.393811788  0.42940825 -0.1592276
## White.Meat               -0.09760147  0.309417061  0.09254681 -0.2919567
## Egg                       0.26932734 -0.059357751 -0.63995627 -0.2652806
## Milk                     -0.15848975  0.307976584 -0.17405921  0.5444724
## Fish                     -0.20323386  0.303075844  0.06315829 -0.5200308
## Cereals                  -0.29201244 -0.196460437  0.06971238 -0.2001491
## Starchy.Foods            -0.42198545 -0.680457657 -0.11769041  0.1889672
## Pulses.Nuts.and.Oilseeds  0.22507285 -0.087921207 -0.57816932 -0.0829400
## Fruits.and.Vegetables     0.16834367  0.222568384  0.08684392  0.3701826
## Total                    -0.47623561 -0.007702046 -0.05178373 -0.1801923
##                                  PC9        PC10
## Red.Meat                 -0.17150487  0.20838019
## White.Meat               -0.46186736  0.22903415
## Egg                       0.48098579  0.06827056
## Milk                     -0.13218960  0.43456461
## Fish                      0.01789764  0.21247753
## Cereals                   0.30436394  0.67412235
## Starchy.Foods            -0.14706957  0.10134794
## Pulses.Nuts.and.Oilseeds -0.58938418  0.12362100
## Fruits.and.Vegetables     0.20995988  0.11723988
## Total                    -0.04898111 -0.41440004
# Sample scores -> stored in Protein_Consumption_pca$x 
Protein_Consumption_pca$x  # returns the sample scores for each country along each principal component.
##              PC1         PC2        PC3         PC4          PC5         PC6
##  [1,]  3.5978397 -0.64061101  1.1118946 -1.91119245  1.884437106 -0.37593345
##  [2,] -1.3862854 -0.70991905  1.1613381  0.93107494 -0.009121937  0.75816906
##  [3,] -1.6608482  0.10781730 -0.4231894  0.24680766  0.188016546 -0.91001548
##  [4,]  2.9881523 -1.84361307 -0.0730564  0.30616165 -0.134812297  0.29005421
##  [5,] -0.3686147 -0.10141825  1.2155042  0.72202089 -0.062918010 -0.37091750
##  [6,] -2.4923551  0.18474749 -0.2075253 -0.93906831 -0.822177041  0.65204948
##  [7,] -1.2387459  1.58140979  1.9302394  0.77259151 -0.139755937 -0.58954056
##  [8,] -1.7732789 -0.75352175 -0.3644876 -2.28429396 -1.224019848  0.17828822
##  [9,] -1.6448018 -0.30606640 -2.4846910  1.25325810  0.230223125 -0.33223855
## [10,]  2.0943234 -0.61997417 -3.0846378  0.31332068  0.270784604  0.64981699
## [11,]  1.4808993 -0.43978564  1.6090270  1.21709297 -0.143865961  0.11534733
## [12,] -2.6714332 -1.03848419 -0.2833724  0.15763312  0.181076517 -0.86151844
## [13,]  1.5660043 -0.01064018 -0.5907111  0.54266246  1.069631810  0.77586008
## [14,] -1.7006997 -0.50438298  0.7596605  0.64321026  0.292062273  0.92348043
## [15,] -0.8828201  1.28521025 -0.1832152 -1.71931314 -0.439007528  0.41757899
## [16,] -0.2286613  0.19642466 -0.4058046  1.67696384 -1.334150980  0.08818598
## [17,]  2.0912590  4.41252506 -0.6718598 -0.03434506 -0.291193444  0.33278906
## [18,]  2.6049767 -1.05771521  0.5868844 -0.14252039 -0.533268313 -0.20083289
## [19,]  1.5709389  2.67472726 -0.2892457  0.23912301  0.594881631 -0.60647031
## [20,] -1.8343339  0.36443676  0.5444138 -1.56417414  0.158327086  0.80195706
## [21,] -0.9293183 -0.96269089 -0.3476755  0.27836268  0.755554148  0.70844461
## [22,] -1.9728952 -0.55508144 -0.8727628 -0.60997694  1.396218668 -1.20971357
## [23,]  0.7660628 -0.48463412 -0.2720099 -0.40950179 -1.470304012 -1.24044252
## [24,] -1.6857673  0.30943116  1.2190705  0.55052071  0.810416131  0.20076819
## [25,]  3.7104025 -1.08819138  0.4162119 -0.23641829 -1.227034337 -0.19516642
##                 PC7          PC8          PC9          PC10
##  [1,]  0.6467777066  0.308209567 -0.344610598 -7.771561e-16
##  [2,]  0.0005093868 -0.012933034  0.124176638 -9.471590e-16
##  [3,]  0.1534640851 -0.334041295  0.023323758 -3.330669e-16
##  [4,]  0.5999541449 -0.762640350  0.674235551  1.665335e-16
##  [5,]  0.7878924305 -0.039689570  0.241927022 -6.661338e-16
##  [6,] -0.0364433564 -0.984127670 -0.168254146 -5.551115e-17
##  [7,] -0.0632650200 -0.313388346  0.320254182 -8.881784e-16
##  [8,] -0.0506617637  0.792618282  0.004268287 -6.661338e-16
##  [9,]  1.3629405718 -0.176345585 -0.392094989 -4.440892e-16
## [10,] -1.1867279230 -0.252605939 -0.185325024 -4.440892e-16
## [11,] -0.8173673169 -0.201792286 -0.496946360 -9.714451e-16
## [12,] -0.7338089555  0.194588527 -0.047542669 -5.551115e-16
## [13,]  0.0085984337  0.435335074  0.815121519 -8.326673e-16
## [14,] -0.2530352518  0.088559649 -0.434700410 -1.051242e-15
## [15,]  0.0122896156  0.009259812  0.182509788 -4.718448e-16
## [16,] -0.0295375727  0.839590880  0.341088667 -7.771561e-16
## [17,]  0.6466024099 -0.205548666 -0.304794550 -1.110223e-15
## [18,] -0.2135771460 -0.211277632 -0.024663621 -3.677614e-16
## [19,] -0.9520057576  0.408309790  0.166895175 -1.221245e-15
## [20,] -0.1459371778 -0.241698086  0.340921291 -6.106227e-16
## [21,]  0.6841927749  0.678069260 -0.252544924 -1.040834e-15
## [22,] -0.4798955917 -0.365578790  0.224480622 -2.636780e-16
## [23,]  0.3126133335  0.287576242 -0.038575860 -1.110223e-16
## [24,] -0.0977006735  0.141407912 -0.415483582 -1.110223e-15
## [25,] -0.1558713867 -0.081857747 -0.353665768 -4.718448e-16
# Sample scores (also called factor scores) are the values of each observation 
# (in this case, each country) along each principal component (PC).
# Sample scores basically tells how much each observation contributes to each PC

# Singular values (square roots of eigenvalues) -> stored in Protein_Consumption_pca$sdev
Protein_Consumption_pca$sdev
##  [1] 2.032257e+00 1.319067e+00 1.144237e+00 1.021544e+00 8.360847e-01
##  [6] 6.531975e-01 5.841454e-01 4.366348e-01 3.458098e-01 6.618503e-16
# Loadings (eigenvectors) -> stored in Protein_Consumption_pca$rotation
Protein_Consumption_pca$rotation # provides the matrix of loadings or principal component coefficients.
##                                 PC1         PC2         PC3          PC4
## Red.Meat                 -0.3180769 -0.17809245 -0.38142753 -0.039766137
## White.Meat               -0.3140588 -0.11783853  0.36420271  0.538507972
## Egg                      -0.4202281 -0.08236350  0.02047575  0.155623651
## Milk                     -0.3870300 -0.23356182 -0.19997405 -0.320360929
## Fish                     -0.1271598  0.57388821 -0.33003267 -0.304161366
## Cereals                   0.4177240 -0.31321549 -0.02354236  0.104798477
## Starchy.Foods            -0.2880798  0.41038324  0.05768490  0.150709175
## Pulses.Nuts.and.Oilseeds  0.4177658  0.04145202 -0.24796403  0.008042093
## Fruits.and.Vegetables     0.1197680  0.34858202 -0.41210384  0.643455476
## Total                    -0.1062294 -0.41709540 -0.58081103  0.203145847
##                                  PC5          PC6         PC7        PC8
## Red.Meat                  0.53138781 -0.393811788  0.42940825 -0.1592276
## White.Meat               -0.09760147  0.309417061  0.09254681 -0.2919567
## Egg                       0.26932734 -0.059357751 -0.63995627 -0.2652806
## Milk                     -0.15848975  0.307976584 -0.17405921  0.5444724
## Fish                     -0.20323386  0.303075844  0.06315829 -0.5200308
## Cereals                  -0.29201244 -0.196460437  0.06971238 -0.2001491
## Starchy.Foods            -0.42198545 -0.680457657 -0.11769041  0.1889672
## Pulses.Nuts.and.Oilseeds  0.22507285 -0.087921207 -0.57816932 -0.0829400
## Fruits.and.Vegetables     0.16834367  0.222568384  0.08684392  0.3701826
## Total                    -0.47623561 -0.007702046 -0.05178373 -0.1801923
##                                  PC9        PC10
## Red.Meat                 -0.17150487  0.20838019
## White.Meat               -0.46186736  0.22903415
## Egg                       0.48098579  0.06827056
## Milk                     -0.13218960  0.43456461
## Fish                      0.01789764  0.21247753
## Cereals                   0.30436394  0.67412235
## Starchy.Foods            -0.14706957  0.10134794
## Pulses.Nuts.and.Oilseeds -0.58938418  0.12362100
## Fruits.and.Vegetables     0.20995988  0.11723988
## Total                    -0.04898111 -0.41440004
# Variable means -> stored in Protein_Consumption_pca$center
Protein_Consumption_pca$center
##                 Red.Meat               White.Meat                      Egg 
##                     9.80                     7.92                     3.08 
##                     Milk                     Fish                  Cereals 
##                    17.28                     4.28                    32.32 
##            Starchy.Foods Pulses.Nuts.and.Oilseeds    Fruits.and.Vegetables 
##                     4.36                     3.08                     4.20 
##                    Total 
##                    86.32
# Variable standard deviations -> stored in Protein_Consumption_pca$scale
Protein_Consumption_pca$scale
##                 Red.Meat               White.Meat                      Egg 
##                 3.403430                 3.740766                 1.115049 
##                     Milk                     Fish                  Cereals 
##                 7.097652                 3.470351                11.010298 
##            Starchy.Foods Pulses.Nuts.and.Oilseeds    Fruits.and.Vegetables 
##                 1.655295                 2.019076                 1.914854 
##                    Total 
##                 6.768309
# A table containing eigenvalues and %'s accounted, follows
# Eigenvalues are sdev^2

#Extract variance against features
eigenvalues<-Protein_Consumption_pca$sdev^2
eigenvalues
##  [1] 4.130067e+00 1.739939e+00 1.309278e+00 1.043551e+00 6.990377e-01
##  [6] 4.266669e-01 3.412258e-01 1.906500e-01 1.195844e-01 4.380459e-31
sum(eigenvalues)
## [1] 10
names(eigenvalues) <- paste("PC",1:10,sep="")
eigenvalues
##          PC1          PC2          PC3          PC4          PC5          PC6 
## 4.130067e+00 1.739939e+00 1.309278e+00 1.043551e+00 6.990377e-01 4.266669e-01 
##          PC7          PC8          PC9         PC10 
## 3.412258e-01 1.906500e-01 1.195844e-01 4.380459e-31
sumoflambdas <- sum(eigenvalues)
sumoflambdas
## [1] 10
#Variance %
bfcvar<- (eigenvalues/sumoflambdas)*100
round(bfcvar,10)
##       PC1       PC2       PC3       PC4       PC5       PC6       PC7       PC8 
## 41.300672 17.399386 13.092782 10.435513  6.990377  4.266669  3.412258  1.906500 
##       PC9      PC10 
##  1.195844  0.000000
barplot(bfcvar,main="Scree plot",xlab="Principal Component",ylab="Percent Variation")

#Calculate cumulative of variance
cumvar <- cumsum(bfcvar)
cumvar
##       PC1       PC2       PC3       PC4       PC5       PC6       PC7       PC8 
##  41.30067  58.70006  71.79284  82.22835  89.21873  93.48540  96.89766  98.80416 
##       PC9      PC10 
## 100.00000 100.00000
matlambdas <- rbind(eigenvalues,bfcvar,cumvar)
round(matlambdas,10)
##                   PC1       PC2       PC3       PC4        PC5        PC6
## eigenvalues  4.130067  1.739939  1.309278  1.043551  0.6990377  0.4266669
## bfcvar      41.300672 17.399386 13.092782 10.435513  6.9903765  4.2666693
## cumvar      41.300672 58.700057 71.792839 82.228352 89.2187289 93.4853982
##                    PC7      PC8         PC9 PC10
## eigenvalues  0.3412258  0.19065   0.1195844    0
## bfcvar       3.4122581  1.90650   1.1958440    0
## cumvar      96.8976563 98.80416 100.0000000  100
eigenvec_Protein_Consumption <- Protein_Consumption_pca$rotation
eigenvec_Protein_Consumption
##                                 PC1         PC2         PC3          PC4
## Red.Meat                 -0.3180769 -0.17809245 -0.38142753 -0.039766137
## White.Meat               -0.3140588 -0.11783853  0.36420271  0.538507972
## Egg                      -0.4202281 -0.08236350  0.02047575  0.155623651
## Milk                     -0.3870300 -0.23356182 -0.19997405 -0.320360929
## Fish                     -0.1271598  0.57388821 -0.33003267 -0.304161366
## Cereals                   0.4177240 -0.31321549 -0.02354236  0.104798477
## Starchy.Foods            -0.2880798  0.41038324  0.05768490  0.150709175
## Pulses.Nuts.and.Oilseeds  0.4177658  0.04145202 -0.24796403  0.008042093
## Fruits.and.Vegetables     0.1197680  0.34858202 -0.41210384  0.643455476
## Total                    -0.1062294 -0.41709540 -0.58081103  0.203145847
##                                  PC5          PC6         PC7        PC8
## Red.Meat                  0.53138781 -0.393811788  0.42940825 -0.1592276
## White.Meat               -0.09760147  0.309417061  0.09254681 -0.2919567
## Egg                       0.26932734 -0.059357751 -0.63995627 -0.2652806
## Milk                     -0.15848975  0.307976584 -0.17405921  0.5444724
## Fish                     -0.20323386  0.303075844  0.06315829 -0.5200308
## Cereals                  -0.29201244 -0.196460437  0.06971238 -0.2001491
## Starchy.Foods            -0.42198545 -0.680457657 -0.11769041  0.1889672
## Pulses.Nuts.and.Oilseeds  0.22507285 -0.087921207 -0.57816932 -0.0829400
## Fruits.and.Vegetables     0.16834367  0.222568384  0.08684392  0.3701826
## Total                    -0.47623561 -0.007702046 -0.05178373 -0.1801923
##                                  PC9        PC10
## Red.Meat                 -0.17150487  0.20838019
## White.Meat               -0.46186736  0.22903415
## Egg                       0.48098579  0.06827056
## Milk                     -0.13218960  0.43456461
## Fish                      0.01789764  0.21247753
## Cereals                   0.30436394  0.67412235
## Starchy.Foods            -0.14706957  0.10134794
## Pulses.Nuts.and.Oilseeds -0.58938418  0.12362100
## Fruits.and.Vegetables     0.20995988  0.11723988
## Total                    -0.04898111 -0.41440004
# Relation between the variables w.r.t countries
Protein_Consumption_env <- cbind(data.frame(Country),Protein_Consumption_pca$x)
Protein_Consumption_env
##           Country        PC1         PC2        PC3         PC4          PC5
## 1         Albania  3.5978397 -0.64061101  1.1118946 -1.91119245  1.884437106
## 2         Austria -1.3862854 -0.70991905  1.1613381  0.93107494 -0.009121937
## 3         Belgium -1.6608482  0.10781730 -0.4231894  0.24680766  0.188016546
## 4        Bulgaria  2.9881523 -1.84361307 -0.0730564  0.30616165 -0.134812297
## 5  Czechoslovakia -0.3686147 -0.10141825  1.2155042  0.72202089 -0.062918010
## 6         Denmark -2.4923551  0.18474749 -0.2075253 -0.93906831 -0.822177041
## 7    East Germany -1.2387459  1.58140979  1.9302394  0.77259151 -0.139755937
## 8         Finland -1.7732789 -0.75352175 -0.3644876 -2.28429396 -1.224019848
## 9          France -1.6448018 -0.30606640 -2.4846910  1.25325810  0.230223125
## 10         Greece  2.0943234 -0.61997417 -3.0846378  0.31332068  0.270784604
## 11        Hungary  1.4808993 -0.43978564  1.6090270  1.21709297 -0.143865961
## 12        Ireland -2.6714332 -1.03848419 -0.2833724  0.15763312  0.181076517
## 13          Italy  1.5660043 -0.01064018 -0.5907111  0.54266246  1.069631810
## 14    Netherlands -1.7006997 -0.50438298  0.7596605  0.64321026  0.292062273
## 15         Norway -0.8828201  1.28521025 -0.1832152 -1.71931314 -0.439007528
## 16         Poland -0.2286613  0.19642466 -0.4058046  1.67696384 -1.334150980
## 17       Portugal  2.0912590  4.41252506 -0.6718598 -0.03434506 -0.291193444
## 18        Romania  2.6049767 -1.05771521  0.5868844 -0.14252039 -0.533268313
## 19          Spain  1.5709389  2.67472726 -0.2892457  0.23912301  0.594881631
## 20         Sweden -1.8343339  0.36443676  0.5444138 -1.56417414  0.158327086
## 21    Switzerland -0.9293183 -0.96269089 -0.3476755  0.27836268  0.755554148
## 22 United Kingdom -1.9728952 -0.55508144 -0.8727628 -0.60997694  1.396218668
## 23           USSR  0.7660628 -0.48463412 -0.2720099 -0.40950179 -1.470304012
## 24   West Germany -1.6857673  0.30943116  1.2190705  0.55052071  0.810416131
## 25     Yugoslavia  3.7104025 -1.08819138  0.4162119 -0.23641829 -1.227034337
##            PC6           PC7          PC8          PC9          PC10
## 1  -0.37593345  0.6467777066  0.308209567 -0.344610598 -7.771561e-16
## 2   0.75816906  0.0005093868 -0.012933034  0.124176638 -9.471590e-16
## 3  -0.91001548  0.1534640851 -0.334041295  0.023323758 -3.330669e-16
## 4   0.29005421  0.5999541449 -0.762640350  0.674235551  1.665335e-16
## 5  -0.37091750  0.7878924305 -0.039689570  0.241927022 -6.661338e-16
## 6   0.65204948 -0.0364433564 -0.984127670 -0.168254146 -5.551115e-17
## 7  -0.58954056 -0.0632650200 -0.313388346  0.320254182 -8.881784e-16
## 8   0.17828822 -0.0506617637  0.792618282  0.004268287 -6.661338e-16
## 9  -0.33223855  1.3629405718 -0.176345585 -0.392094989 -4.440892e-16
## 10  0.64981699 -1.1867279230 -0.252605939 -0.185325024 -4.440892e-16
## 11  0.11534733 -0.8173673169 -0.201792286 -0.496946360 -9.714451e-16
## 12 -0.86151844 -0.7338089555  0.194588527 -0.047542669 -5.551115e-16
## 13  0.77586008  0.0085984337  0.435335074  0.815121519 -8.326673e-16
## 14  0.92348043 -0.2530352518  0.088559649 -0.434700410 -1.051242e-15
## 15  0.41757899  0.0122896156  0.009259812  0.182509788 -4.718448e-16
## 16  0.08818598 -0.0295375727  0.839590880  0.341088667 -7.771561e-16
## 17  0.33278906  0.6466024099 -0.205548666 -0.304794550 -1.110223e-15
## 18 -0.20083289 -0.2135771460 -0.211277632 -0.024663621 -3.677614e-16
## 19 -0.60647031 -0.9520057576  0.408309790  0.166895175 -1.221245e-15
## 20  0.80195706 -0.1459371778 -0.241698086  0.340921291 -6.106227e-16
## 21  0.70844461  0.6841927749  0.678069260 -0.252544924 -1.040834e-15
## 22 -1.20971357 -0.4798955917 -0.365578790  0.224480622 -2.636780e-16
## 23 -1.24044252  0.3126133335  0.287576242 -0.038575860 -1.110223e-16
## 24  0.20076819 -0.0977006735  0.141407912 -0.415483582 -1.110223e-15
## 25 -0.19516642 -0.1558713867 -0.081857747 -0.353665768 -4.718448e-16
#Visualize PCA using Scree plot
screeplot(Protein_Consumption_pca, type='bar',main='Scree plot')

summary(Protein_Consumption_pca)
## Importance of components:
##                          PC1   PC2    PC3    PC4    PC5     PC6     PC7     PC8
## Standard deviation     2.032 1.319 1.1442 1.0215 0.8361 0.65320 0.58415 0.43663
## Proportion of Variance 0.413 0.174 0.1309 0.1044 0.0699 0.04267 0.03412 0.01906
## Cumulative Proportion  0.413 0.587 0.7179 0.8223 0.8922 0.93485 0.96898 0.98804
##                            PC9      PC10
## Standard deviation     0.34581 6.619e-16
## Proportion of Variance 0.01196 0.000e+00
## Cumulative Proportion  1.00000 1.000e+00
# Principal Component Analysis is used to reduce the large set of variables to a 
# small set that still contains most of the information from the large dataset and thereby, 
# reducing the dimension of the dataset.
# After performing PCA on the Protein Consumption Dataset, we obtain 10 Principal components. 
# Each of these components represent the percentage of variability present in the dataset.
# In other words, PC1 explains 41.3% of total variance, PC2 explains 17.4% and so on. 
# We will consider the first six principal components as they sum up to 93% of total variance 
# and the other four can be discarded as they contribute to only 7% of total variance.


# Better Ways to Visualize

library(factoextra)
library(FactoMineR)
library(ggfortify)
library(psych)
library(corrplot)
library(devtools)

# Correlation
pairs.panels(Protein_Consumption[,-1],
             gap = 0,
             bg = c("red", "blue")[Protein_Consumption$Country],
             pch=21)

pairs.panels(Protein_Consumption_pca$x,
             gap=0,
             bg = c("red", "blue")[Protein_Consumption$Country],
             pch=21)

# Variables - PCA (cos2)
fviz_eig(Protein_Consumption_pca, addlabels = TRUE)

fviz_pca_var(Protein_Consumption_pca,col.var = "cos2",
             gradient.cols = c("#FFCC00", "#CC9933", "#660033", "#330033"),
             repel = TRUE)

# The plot above is also known as Variable Correlation plots. 
# It shows the relationships between all variables. It can be interpreted as follow:
# Positively correlated variables are grouped together.
# Negatively correlated variables are positioned on opposite sides of the plot origin (opposed quadrants).
# The distance between variables and the origin measures the quality of the variables on the factor map. 
# Variables that are away from the origin are well represented on the factor map.


# Individuals - PCA (cos2)
fviz_pca_ind(Protein_Consumption_pca, col.ind = "cos2", 
             gradient.cols = c("#FFCC00", "#CC9933", "#660033", "#330033"), 
             repel = TRUE)

# Biplot
biplot(Protein_Consumption_pca)

autoplot(Protein_Consumption_pca,
         data = Protein_Consumption[,-1],
         loadings = TRUE,
         labels = Protein_Consumption$Country)

# Different PCA Method. 
res.pca <- PCA(Protein_Consumption[,-1], graph = FALSE)
print(res.pca)
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 25 individuals, described by 10 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"
# Visualize and Interpret PCA using these functions 

# get_eigenvalue(res.pca): Extract the eigenvalues/variances of principal components
# fviz_eig(res.pca): Visualize the eigenvalues
# get_pca_ind(res.pca), get_pca_var(res.pca): Extract the results for individuals and variables, respectively.
# fviz_pca_ind(res.pca), fviz_pca_var(res.pca): Visualize the results individuals and variables, respectively.
# fviz_pca_biplot(res.pca): Make a biplot of individuals and variables.

eig.val <- get_eigenvalue(res.pca)
eig.val
##          eigenvalue variance.percent cumulative.variance.percent
## Dim.1  4.130067e+00     4.130067e+01                    41.30067
## Dim.2  1.739939e+00     1.739939e+01                    58.70006
## Dim.3  1.309278e+00     1.309278e+01                    71.79284
## Dim.4  1.043551e+00     1.043551e+01                    82.22835
## Dim.5  6.990377e-01     6.990377e+00                    89.21873
## Dim.6  4.266669e-01     4.266669e+00                    93.48540
## Dim.7  3.412258e-01     3.412258e+00                    96.89766
## Dim.8  1.906500e-01     1.906500e+00                    98.80416
## Dim.9  1.195844e-01     1.195844e+00                   100.00000
## Dim.10 1.906537e-31     1.906537e-30                   100.00000
# Scree Plot
fviz_eig(res.pca, addlabels = TRUE, ylim = c(0, 50))

# As we can see that we got the same plot with this method also.

var <- get_pca_var(res.pca)
# var$coord: coordinates of variables to create a scatter plot
# var$cos2: represents the quality of representation for variables on the factor map. It’s calculated as the squared coordinates: var.cos2 = var.coord * var.coord.
# var$contrib: contains the contributions (in percentage) of the variables to the principal components. 
# The contribution of a variable (var) to a given principal component is (in percentage) : (var.cos2 * 100) / (total cos2 of the component).
var
## Principal Component Analysis Results for variables
##  ===================================================
##   Name       Description                                    
## 1 "$coord"   "Coordinates for the variables"                
## 2 "$cor"     "Correlations between variables and dimensions"
## 3 "$cos2"    "Cos2 for the variables"                       
## 4 "$contrib" "contributions of the variables"
# Coordinates
head(var$coord)
##                 Dim.1      Dim.2       Dim.3       Dim.4       Dim.5
## Red.Meat    0.6464140 -0.2349159  0.43644348 -0.04062284 -0.44428523
## White.Meat  0.6382482 -0.1554370 -0.41673420  0.55010936  0.08160309
## Egg         0.8540114 -0.1086430 -0.02342911  0.15897634 -0.22518048
## Milk        0.7865443 -0.3080838  0.22881770 -0.32726265  0.13251086
## Fish        0.2584213  0.7569972  0.37763557 -0.31071409  0.16992073
## Cereals    -0.8489223 -0.4131523  0.02693804  0.10705621  0.24414713
# Cos2: quality on the factore map
head(var$cos2)
##                 Dim.1      Dim.2        Dim.3       Dim.4       Dim.5
## Red.Meat   0.41785102 0.05518550 0.1904829088 0.001650215 0.197389362
## White.Meat 0.40736071 0.02416065 0.1736673957 0.302620310 0.006659065
## Egg        0.72933547 0.01180330 0.0005489233 0.025273477 0.050706246
## Milk       0.61865187 0.09491560 0.0523575399 0.107100842 0.017559128
## Fish       0.06678156 0.57304473 0.1426086263 0.096543246 0.028873053
## Cereals    0.72066906 0.17069483 0.0007256581 0.011461032 0.059607823
# Contributions to the principal components
head(var$contrib)
##                Dim.1      Dim.2       Dim.3      Dim.4      Dim.5
## Red.Meat   10.117293  3.1716922 14.54869623  0.1581346 28.2373005
## White.Meat  9.863295  1.3885919 13.26436162 28.9990836  0.9526046
## Egg        17.659167  0.6783745  0.04192564  2.4218721  7.2537218
## Milk       14.979220  5.4551123  3.99896215 10.2631125  2.5119001
## Fish        1.616961 32.9347677 10.89215614  9.2514136  4.1304003
## Cereals    17.449330  9.8103941  0.05542429  1.0982721  8.5271262
# Correlation circle
fviz_pca_var(res.pca, col.var = "black")

# Quality of representation

corrplot(var$cos2, is.corr=FALSE)

# Total cos2 of variables on Dim.1 and Dim.2
# A high cos2 indicates a good representation of the variable on the principal component,
# -> In this case the variable is positioned close to the circumference of the correlation circle.

# A low cos2 indicates that the variable is not perfectly represented by the PCs,
# -> In this case the variable is close to the center of the circle.

fviz_cos2(res.pca, choice = "var", axes = 1:2)

fviz_pca_var(res.pca, col.var = "cos2",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), 
             repel = TRUE # Avoid text overlapping
)

# Change the transparency by cos2 values
fviz_pca_var(res.pca, alpha.var = "cos2")

corrplot(var$contrib, is.corr=FALSE)

# Contributions of variables to PC1
fviz_contrib(res.pca, choice = "var", axes = 1, top = 10)

# Contributions of variables to PC2
fviz_contrib(res.pca, choice = "var", axes = 2, top = 10)

fviz_pca_var(res.pca, col.var = "contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)

fviz_pca_var(res.pca, alpha.var = "contrib")

# =============================================================================================

# Question 2 - Carry out cluster analysis to study relation between countries on their diet


# K-means Clustering

# Standardizing the data with scale()
matstd.Protein_Consumption <- scale(Protein_Consumption[-1])
matstd.Protein_Consumption
##          Red.Meat  White.Meat         Egg        Milk        Fish    Cereals
##  [1,]  0.05876425 -1.84988830 -1.86538958 -1.16658295 -1.23330478  0.8791769
##  [2,] -0.23505701  1.62533538  0.82507616  0.38322532 -0.65699414 -0.3923599
##  [3,]  1.23404931  0.28871089  0.82507616  0.10144200  0.20747183 -0.4831840
##  [4,] -0.52887828 -0.51326380 -0.96856767 -1.30747461 -0.94514946  2.2415378
##  [5,]  0.05876425  0.82336069 -0.07174575 -0.60301630 -0.65699414  0.1525844
##  [6,]  0.35258552  0.82336069  0.82507616  1.08768362  1.64824845 -0.9373043
##  [7,] -0.52887828  1.09068558  0.82507616 -0.88479963  0.20747183 -0.6648321
##  [8,]  0.05876425 -0.78058870 -0.07174575  2.35570856  0.49562716 -0.5740081
##  [9,]  2.40933437  0.55603579 -0.07174575  0.38322532  0.49562716 -0.3923599
## [10,]  0.05876425 -1.31523850 -0.07174575  0.10144200  0.49562716  0.8791769
## [11,] -1.41034207  1.09068558 -0.07174575 -1.02569129 -1.23330478  0.6975288
## [12,]  1.23404931  0.55603579  1.72189807  1.22857528 -0.65699414 -0.7556562
## [13,] -0.23505701 -0.78058870 -0.07174575 -0.46212464 -0.36883881  0.4250566
## [14,]  0.05876425  1.62533538  0.82507616  0.80590030 -0.36883881 -0.9373043
## [15,] -0.23505701 -0.78058870 -0.07174575  0.80590030  1.64824845 -0.8464803
## [16,] -0.82269954  0.55603579 -0.07174575  0.24233366 -0.36883881  0.3342325
## [17,] -1.11652080 -1.04791360 -1.86538958 -1.73014959  2.80086974 -0.4831840
## [18,] -1.11652080 -0.51326380 -0.96856767 -0.88479963 -0.94514946  1.6057694
## [19,] -0.82269954 -1.31523850 -0.07174575 -1.16658295  0.78378248 -0.3015359
## [20,]  0.05876425  0.02138599  0.82507616  1.08768362  1.07193780 -1.1189524
## [21,]  0.94022805  0.55603579 -0.07174575  0.94679196 -0.65699414 -0.5740081
## [22,]  2.11551310 -0.51326380  1.72189807  0.52411698 -0.08068349 -0.7556562
## [23,] -0.23505701 -0.78058870 -0.96856767 -0.03944966 -0.36883881  1.0608250
## [24,]  0.35258552  1.35801048  0.82507616  0.24233366 -0.36883881 -1.2097765
## [25,] -1.70416333 -0.78058870 -1.86538958 -1.02569129 -0.94514946  2.1507138
##       Starchy.Foods Pulses.Nuts.and.Oilseeds Fruits.and.Vegetables       Total
##  [1,]    -2.0298502               1.44620630            -1.1489125 -2.11574280
##  [2,]    -0.2174840              -1.03017435            -0.1044466 -0.04727917
##  [3,]     0.9907602              -0.53489822            -0.1044466  0.39596304
##  [4,]    -2.0298502               0.45565404            -0.1044466  0.69145784
##  [5,]     0.3866381              -1.03017435            -0.1044466 -0.49052138
##  [6,]     0.3866381              -1.03017435            -1.1489125  0.69145784
##  [7,]     1.5948823              -1.03017435            -0.1044466 -1.37700579
##  [8,]     0.3866381              -1.03017435            -1.6711455  0.69145784
##  [9,]     0.3866381              -0.53489822             1.4622523  1.87343706
## [10,]    -1.4257281               2.43675857             1.4622523  1.87343706
## [11,]    -0.2174840               0.95093017            -0.1044466 -0.49052138
## [12,]     0.9907602              -0.53489822            -0.6266796  0.83920525
## [13,]    -1.4257281               0.45565404             1.4622523 -0.34277397
## [14,]    -0.2174840              -0.53489822            -0.1044466 -0.04727917
## [15,]     0.3866381              -0.53489822            -0.6266796 -0.49052138
## [16,]     0.9907602              -0.53489822             1.4622523  0.98695265
## [17,]     0.9907602               0.95093017             1.9844853 -1.52475319
## [18,]    -0.8216060               0.95093017            -0.6266796  0.10046823
## [19,]     0.9907602               1.44620630             1.4622523 -1.37700579
## [20,]    -0.2174840              -1.03017435            -1.1489125 -0.63826878
## [21,]    -0.8216060              -0.53489822             0.4177864  0.24821564
## [22,]     0.3866381              -0.03962209            -0.6266796  0.24821564
## [23,]     0.9907602              -0.03962209            -0.6266796  0.83920525
## [24,]     0.3866381              -0.53489822            -0.1044466 -0.93376358
## [25,]    -0.8216060               1.44620630            -0.6266796  0.39596304
## attr(,"scaled:center")
##                 Red.Meat               White.Meat                      Egg 
##                     9.80                     7.92                     3.08 
##                     Milk                     Fish                  Cereals 
##                    17.28                     4.28                    32.32 
##            Starchy.Foods Pulses.Nuts.and.Oilseeds    Fruits.and.Vegetables 
##                     4.36                     3.08                     4.20 
##                    Total 
##                    86.32 
## attr(,"scaled:scale")
##                 Red.Meat               White.Meat                      Egg 
##                 3.403430                 3.740766                 1.115049 
##                     Milk                     Fish                  Cereals 
##                 7.097652                 3.470351                11.010298 
##            Starchy.Foods Pulses.Nuts.and.Oilseeds    Fruits.and.Vegetables 
##                 1.655295                 2.019076                 1.914854 
##                    Total 
##                 6.768309
# kmeans() is the function for  non-hierarchical method. 
rownames(matstd.Protein_Consumption) <- Protein_Consumption$Country

# For 2 clusters, k-means = 2
kmeans2.Protein_Consumption <- kmeans(matstd.Protein_Consumption,2,nstart = 10)

# Computing the percentage of variation accounted for two  clusters
perc.var.2 <- round(100*(1 - kmeans2.Protein_Consumption$betweenss/kmeans2.Protein_Consumption$totss),1)
names(perc.var.2) <- "Perc. 2 clus"
perc.var.2
## Perc. 2 clus 
##         64.7
# For 3 clusters, k-means = 3
kmeans3.Protein_Consumption <- kmeans(matstd.Protein_Consumption,3,nstart = 10)
# Computing the percentage of variation accounted for three  clusters
perc.var.3 <- round(100*(1 - kmeans3.Protein_Consumption$betweenss/kmeans3.Protein_Consumption$totss),1)
names(perc.var.3) <- "Perc. 3 clus"
perc.var.3
## Perc. 3 clus 
##         51.9
# For 4 clusters, k-means = 4
kmeans4.Protein_Consumption <- kmeans(matstd.Protein_Consumption,4,nstart = 10)
# Computing the percentage of variation accounted for four clusters
perc.var.4 <- round(100*(1 - kmeans4.Protein_Consumption$betweenss/kmeans4.Protein_Consumption$totss),1)
names(perc.var.4) <- "Perc. 4 clus"
perc.var.4
## Perc. 4 clus 
##           44
# We divide the dataset into two clusters.
# Filtering properties which are in 1 cluster of k mean 2
clus1 <- matrix(names(kmeans2.Protein_Consumption$cluster[kmeans2.Protein_Consumption$cluster == 1]),
                ncol=1, nrow=length(kmeans2.Protein_Consumption$cluster[kmeans2.Protein_Consumption$cluster == 1]))
colnames(clus1) <- "Cluster 1"
clus1
##       Cluster 1   
##  [1,] "Albania"   
##  [2,] "Bulgaria"  
##  [3,] "Greece"    
##  [4,] "Hungary"   
##  [5,] "Italy"     
##  [6,] "Portugal"  
##  [7,] "Romania"   
##  [8,] "Spain"     
##  [9,] "USSR"      
## [10,] "Yugoslavia"
# Filtering properties which are in 2 cluster of k mean 2
clus2 <- matrix(names(kmeans2.Protein_Consumption$cluster[kmeans2.Protein_Consumption$cluster == 2]),
                ncol=1, nrow=length(kmeans2.Protein_Consumption$cluster[kmeans2.Protein_Consumption$cluster == 2]))
colnames(clus2) <- "Cluster 2"
clus2
##       Cluster 2       
##  [1,] "Austria"       
##  [2,] "Belgium"       
##  [3,] "Czechoslovakia"
##  [4,] "Denmark"       
##  [5,] "East Germany"  
##  [6,] "Finland"       
##  [7,] "France"        
##  [8,] "Ireland"       
##  [9,] "Netherlands"   
## [10,] "Norway"        
## [11,] "Poland"        
## [12,] "Sweden"        
## [13,] "Switzerland"   
## [14,] "United Kingdom"
## [15,] "West Germany"
# We can use the hierarchical clustering method, which allows us to create a dendrogram 
# to visualize the relationships between the countries based on their protein consumption from different food sources.
# Hierarchical Clustering

# Calculating the distance matrix using Euclidean distance
dist.Protein_Consumption <- dist(Protein_Consumption, method = "euclidean") # Distance matrix
## Warning in dist(Protein_Consumption, method = "euclidean"): NAs introduced by
## coercion
# Performing hierarchical clustering using complete linkage
fit <- hclust(dist.Protein_Consumption, method="complete")

#Plotting the dendogram

plot(fit, main="Dendrogram of European countries based on diet")

#Cutting the tree into 2 clusters
groups <- cutree(fit, k=2)

#Plotting dendogram with red borders around the 2 clusters
rect.hclust(fit, k=2, border="red")

# From the dendrogram, we can see that the countries can be grouped into two main clusters: 
# one containing mostly Northern and Western European countries, and another containing mostly 
# Southern and Eastern European countries.

# The countries in the second cluster (Northern and Western Europe) generally consume more meat, 
# fish, milk, and cereals, while the countries in the first cluster (Southern and Eastern Europe) 
# consume more fruits, vegetables, and pulses/nuts/oilseeds.

# Overall, the hierarchical clustering analysis confirms some of the dietary patterns and 
# differences that we observed in the earlier principal components analysis.


# Clustering is a method of grouping together a set of objects together in such a way 
# that the objects in one cluster is similar to the objects in the same cluster than 
# objects present in the different cluster.
# We form 2 clusters for the given dataset as seen in the dendogram. 
# This covers all the variance present in the dataset.


# ??????????????????????????


library(cluster)
library(factoextra)
library(magrittr)
library(NbClust)

# We will use agnes function as it allows us to select option for data standardization, the distance measure and clustering algorithm in one single function

(dist.Protein_Consumption <- agnes(Protein_Consumption, metric="euclidean", stand=TRUE, method = "single"))
## Call:     agnes(x = Protein_Consumption, metric = "euclidean", stand = TRUE, method = "single") 
## Agglomerative coefficient:  0.375437 
## Order of objects:
##  [1]  1  2 14 24  5 21  7  3 12 22  6 15 20  8  4 11 18 25 23  9 16 13 10 17 19
## Height (summary):
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.107   2.739   3.162   3.174   3.700   4.831 
## 
## Available components:
## [1] "order"  "height" "ac"     "merge"  "diss"   "call"   "method" "data"
#View(dist.Protein_Consumption)

# Description of cluster merging
dist.Protein_Consumption$merge
##       [,1] [,2]
##  [1,]  -18  -25
##  [2,]   -2  -14
##  [3,]    2  -24
##  [4,]  -15  -20
##  [5,]    3   -5
##  [6,]    5  -21
##  [7,]   -3  -12
##  [8,]    6   -7
##  [9,]    7  -22
## [10,]  -11    1
## [11,]    8    9
## [12,]   -6    4
## [13,]   11   12
## [14,]   -4   10
## [15,]   13   -8
## [16,]   14  -23
## [17,]   15   16
## [18,]   17   -9
## [19,]  -17  -19
## [20,]   18  -16
## [21,]   20  -13
## [22,]   21  -10
## [23,]   22   19
## [24,]   -1   23
# Dendogram
plot(as.dendrogram(dist.Protein_Consumption), xlab= "Distance",xlim=c(8,0),
     horiz = TRUE,main="Dendrogram")

#Interactive Plots
#plot(agn.employ,ask=TRUE)
plot(dist.Protein_Consumption, which.plots=1)

plot(dist.Protein_Consumption, which.plots=2)

plot(dist.Protein_Consumption, which.plots=3)


# =============================================================================================

# Question 3 - Identify the important factors underlying the observed variables 
# and examine the relationships between the countries with respect to these factors


# Computing Correlation Matrix
corrm.Protein_Consumption <- cor(Protein_Consumption[c(-1,-11)])
corrm.Protein_Consumption
##                             Red.Meat  White.Meat         Egg       Milk
## Red.Meat                  1.00000000  0.18850977  0.57532001  0.5440251
## White.Meat                0.18850977  1.00000000  0.60095535  0.2974816
## Egg                       0.57532001  0.60095535  1.00000000  0.6130310
## Milk                      0.54402512  0.29748163  0.61303102  1.0000000
## Fish                      0.06491072 -0.19719960  0.04780844  0.1624624
## Cereals                  -0.50970337 -0.43941908 -0.70131040 -0.5924925
## Starchy.Foods             0.15383673  0.33456770  0.41266333  0.2144917
## Pulses.Nuts.and.Oilseeds -0.40988882 -0.67214885 -0.59519381 -0.6238357
## Fruits.and.Vegetables    -0.06393465 -0.07329308 -0.16392249 -0.3997753
##                                 Fish     Cereals Starchy.Foods
## Red.Meat                  0.06491072 -0.50970337     0.1538367
## White.Meat               -0.19719960 -0.43941908     0.3345677
## Egg                       0.04780844 -0.70131040     0.4126633
## Milk                      0.16246239 -0.59249246     0.2144917
## Fish                      1.00000000 -0.51714759     0.4386841
## Cereals                  -0.51714759  1.00000000    -0.5781345
## Starchy.Foods             0.43868411 -0.57813449     1.0000000
## Pulses.Nuts.and.Oilseeds -0.12226043  0.63605948    -0.4951880
## Fruits.and.Vegetables     0.22948842  0.04229293     0.0683567
##                          Pulses.Nuts.and.Oilseeds Fruits.and.Vegetables
## Red.Meat                               -0.4098888           -0.06393465
## White.Meat                             -0.6721488           -0.07329308
## Egg                                    -0.5951938           -0.16392249
## Milk                                   -0.6238357           -0.39977527
## Fish                                   -0.1222604            0.22948842
## Cereals                                 0.6360595            0.04229293
## Starchy.Foods                          -0.4951880            0.06835670
## Pulses.Nuts.and.Oilseeds                1.0000000            0.35133227
## Fruits.and.Vegetables                   0.3513323            1.00000000
corrplot(corrm.Protein_Consumption,method="number")

fit.bfc <- principal(Protein_Consumption[c(-1,-11)], nfactors=4, rotate="varimax")
fit.bfc
## Principal Components Analysis
## Call: principal(r = Protein_Consumption[c(-1, -11)], nfactors = 4, 
##     rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                            RC3   RC1   RC2   RC4   h2    u2 com
## Red.Meat                  0.08  0.92  0.01  0.02 0.86 0.138 1.0
## White.Meat                0.94  0.14 -0.08 -0.01 0.91 0.086 1.1
## Egg                       0.59  0.66  0.13 -0.09 0.81 0.193 2.1
## Milk                      0.20  0.68  0.21 -0.51 0.81 0.188 2.3
## Fish                     -0.21  0.10  0.92  0.09 0.91 0.089 1.2
## Cereals                  -0.42 -0.56 -0.61  0.07 0.87 0.133 2.8
## Starchy.Foods             0.52  0.01  0.71  0.03 0.77 0.226 1.8
## Pulses.Nuts.and.Oilseeds -0.69 -0.34 -0.28  0.41 0.83 0.166 2.6
## Fruits.and.Vegetables    -0.05 -0.04  0.14  0.95 0.93 0.071 1.1
## 
##                        RC3  RC1  RC2  RC4
## SS loadings           2.25 2.21 1.89 1.36
## Proportion Var        0.25 0.25 0.21 0.15
## Cumulative Var        0.25 0.50 0.71 0.86
## Proportion Explained  0.29 0.29 0.25 0.18
## Cumulative Proportion 0.29 0.58 0.82 1.00
## 
## Mean item complexity =  1.8
## Test of the hypothesis that 4 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.05 
##  with the empirical chi square  4.94  with prob <  0.55 
## 
## Fit based upon off diagonal values = 0.98
# Note that here, we are excluding/ignoring the column 'total', because it is just having the summation of all the column values

# We are considering 4 factors, because of VSS (Very Simple Structure)
round(fit.bfc$values, 3)
## [1] 4.096 1.625 1.085 0.905 0.427 0.347 0.270 0.135 0.111
# Loadings
fit.bfc$loadings
## 
## Loadings:
##                          RC3    RC1    RC2    RC4   
## Red.Meat                         0.925              
## White.Meat                0.941  0.142              
## Egg                       0.594  0.655  0.128       
## Milk                      0.197  0.684  0.208 -0.513
## Fish                     -0.214         0.921       
## Cereals                  -0.418 -0.557 -0.614       
## Starchy.Foods             0.518         0.711       
## Pulses.Nuts.and.Oilseeds -0.688 -0.337 -0.277  0.413
## Fruits.and.Vegetables                   0.144  0.951
## 
##                  RC3   RC1   RC2   RC4
## SS loadings    2.249 2.207 1.895 1.361
## Proportion Var 0.250 0.245 0.211 0.151
## Cumulative Var 0.250 0.495 0.706 0.857
# Factor Loadings: This table displays the factor loadings for each variable. 
# It represents the correlation between each variable and each factor. 

# Communalities
fit.bfc$communality
##                 Red.Meat               White.Meat                      Egg 
##                0.8623000                0.9136165                0.8067014 
##                     Milk                     Fish                  Cereals 
##                0.8123492                0.9110157                0.8666324 
##            Starchy.Foods Pulses.Nuts.and.Oilseeds    Fruits.and.Vegetables 
##                0.7744128                0.8343631                0.9293892
# Communalities are estimates of the variance in each observed variable that can be 
# explained by the extracted factors.
# It shows the proportion of each variable's variance that can be explained by the 
# factors, e.g., Red Meat explains 0.862, White Meat explains 0.913, and so on.

# Rotated factor scores
head(fit.bfc$scores)
##             RC3         RC1        RC2         RC4
## [1,] -1.9894396 -0.15885917 -1.3822473 -0.74679784
## [2,]  1.4868236 -0.02157573 -0.6102314 -0.11610160
## [3,]  0.4131939  0.75269512  0.3257944  0.20157345
## [4,] -0.6801210 -0.67812717 -1.7099907  0.01919653
## [5,]  1.0805053 -0.48828520 -0.3651871  0.01149907
## [6,]  0.2912014  0.38648634  1.1836494 -1.20111092
round(fit.bfc$values,3)
## [1] 4.096 1.625 1.085 0.905 0.427 0.347 0.270 0.135 0.111
# Factor recommendation
fa.parallel(Protein_Consumption[c(-1,-11)])

## Parallel analysis suggests that the number of factors =  1  and the number of components =  1
# From this, we can inference that 4th component could be the best choice for number of factors

# Correlations within Factors
fa.plot(fit.bfc)

# Visualizing the relationship
fa.diagram(fit.bfc)

# Factor recommendations for a simple structure
vss(Protein_Consumption[c(-1,-11)]) 
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## 
## Very Simple Structure
## Call: vss(x = Protein_Consumption[c(-1, -11)])
## VSS complexity 1 achieves a maximimum of 0.76  with  1  factors
## VSS complexity 2 achieves a maximimum of 0.87  with  2  factors
## 
## The Velicer MAP achieves a minimum of 0.08  with  1  factors 
## BIC achieves a minimum of  -40  with  1  factors
## Sample Size adjusted BIC achieves a minimum of  1.3  with  5  factors
## 
## Statistics by number of factors 
##   vss1 vss2   map dof   chisq prob sqresid  fit RMSEA   BIC SABIC complex
## 1 0.76 0.00 0.084  27 4.7e+01 0.01    5.31 0.76 0.167 -40.0  43.7     1.0
## 2 0.72 0.87 0.098  19 2.7e+01 0.11    2.90 0.87 0.121 -34.4  24.6     1.3
## 3 0.49 0.84 0.110  12 1.4e+01 0.27    1.71 0.92 0.081 -24.1  13.1     1.7
## 4 0.51 0.82 0.144   6 5.4e+00 0.49    0.78 0.96 0.000 -13.9   4.7     1.8
## 5 0.43 0.75 0.208   1 1.4e+00 0.23    0.67 0.97 0.123  -1.8   1.3     2.0
## 6 0.52 0.74 0.301  -3 4.9e-08   NA    0.45 0.98    NA    NA    NA     2.2
## 7 0.46 0.74 0.496  -6 4.7e-08   NA    0.42 0.98    NA    NA    NA     2.3
## 8 0.46 0.75 1.000  -8 0.0e+00   NA    0.40 0.98    NA    NA    NA     2.3
##    eChisq    SRMR eCRMS eBIC
## 1 3.9e+01 1.5e-01 0.169  -48
## 2 1.5e+01 9.1e-02 0.125  -46
## 3 5.3e+00 5.4e-02 0.094  -33
## 4 8.7e-01 2.2e-02 0.054  -18
## 5 2.4e-01 1.2e-02 0.070   -3
## 6 5.5e-09 1.8e-06    NA   NA
## 7 3.8e-09 1.5e-06    NA   NA
## 8 1.7e-19 9.8e-12    NA   NA
summary(vss(Protein_Consumption[c(-1,-11)]) )
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## An ultra-Heywood case was detected. Examine the results carefully

## 
## Very Simple Structure
## VSS complexity 1 achieves a maximimum of 0.76  with  1  factors
## VSS complexity 2 achieves a maximimum of 0.87  with  2  factors
## 
## The Velicer MAP criterion achieves a minimum of 1  with  1  factors
## 
# Overall, the factor analysis suggests that there are four underlying factors that 
# explain most of the variance in the dataset. These factors represent different aspects 
# of the European diet, such as a diet high in Milk, Egg and Fish (factor 1), a diet high 
# in fish, starchy food and cereals (factor 2), a diet high in white meat, pulses/nuts/oilseeds  
# (factor 3), and a diet high in fruits and vegetables (factor 4). 
# The factor scores for each country provide a way to compare countries based on their diet patterns and 
# identify similarities and differences between them.


# Factor Analysis is the method of identifying the latent relational structure among 
# a set of variables and then narrowing it down to a smaller number of variables.
# We can see that we have reduced the factors to five, this contains most of the 
# information in the dataset.
# The factors RC1, RC2, RC3, RC4 can be helpful in analyzing the entire dataset.